Module 7: PCA Implementation Lab - Marketing Analytics Mission

1. Load Marketing Dataset

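📋 Context: MegaRetail's customer database contains 10M records with 500+ attributes ranging from demographics to behavioral patterns. Your first task is to load and understand this complex dataset. In this lab you will work with a synthetic sample of 10,000 customers and 500 features that mimics its correlated structure.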

data_loading.py

# Step 1 exercise template: build a synthetic marketing dataset.
# Every `____` is a blank for the learner to fill in; the script does not run
# until all blanks have been replaced.
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# TODO: Generate synthetic marketing data
# Create 10000 customers with 500 features
n_customers = ____
n_features = ____

# Generate correlated feature groups (realistic structure)
np.random.seed(42)  # fixed seed so repeated runs draw the same numbers
X = np.random.randn(____, ____)

# Create correlation within feature groups
# The loop walks the columns in blocks of 10 (i = 0, 10, 20, ...).
for i in range(0, n_features, 10):
    # TODO: Add correlation structure
    base = ____
    X[:, i:i+10] = ____  # overwrites the (up to) 10 columns of this block

print(f"Dataset shape: {X.shape}")
print(f"Memory usage: {X.nbytes / 1e6:.1f} MB")  # bytes / 1e6 -> megabytes

💡 Hint: Fill in n_customers=10000, n_features=500. For correlation structure, create a base random vector and add noise to create correlated features within each group of 10.

Output:

0

Data Points

0

Dimensions

0 MB

Memory Usage

2. Standardize Features

📋 Critical Step: PCA is sensitive to scale: features with larger variances dominate the principal components. Features must therefore be standardized (mean=0, std=1) before transformation. Skipping this step is the #1 cause of PCA failure in production!

⚠️ Challenge: What happens if you forget to standardize? Test both approaches and observe the variance explained!

standardization.py

# Test impact of standardization from sklearn.preprocessing import StandardScaler # Without standardization pca_raw = PCA(n_components=10) X_raw_pca = pca_raw.fit_transform(____) variance_raw = pca_raw.explained_variance_ratio_.sum() # With standardization scaler = ____ X_scaled = ____ pca_scaled = PCA(n_components=10) X_scaled_pca = ____ variance_scaled = ____ print(f"Variance explained without scaling: {variance_raw:.2%}") print(f"Variance explained with scaling: {variance_scaled:.2%}") print(f"Improvement: {(variance_scaled - variance_raw):.2%}")

💡 Hint: Use StandardScaler() and its fit_transform method on X. Remember to apply PCA to both raw and scaled data for comparison.

3. Implement PCA Transformation

📋 Core Task: Implement full PCA pipeline and find the optimal number of components that preserves 95% variance while maximizing compression.

pca_implementation.py

class MarketingPCAOptimizer:
    """Exercise template: pick a PCA component count from a variance threshold.

    Every `____` below is a blank for the learner to fill in; the class does
    not run until all blanks have been replaced.
    """

    def __init__(self, variance_threshold=0.95):
        self.variance_threshold = ____
        self.scaler = StandardScaler()
        self.pca = None  # assigned in transform()
        self.n_components_optimal = None  # assigned in find_optimal_components()

    def find_optimal_components(self, X):
        """Find minimum components for variance threshold.

        Stores the chosen count on ``self.n_components_optimal`` and returns a
        dict with the keys 'n_components', 'variance_preserved',
        'compression_ratio' and 'storage_savings_pct'.
        """
        # Standardize
        X_scaled = ____

        # Full PCA to analyze all components
        pca_full = ____
        pca_full.fit(____)

        # Calculate cumulative variance
        cumulative_var = ____

        # Find optimal n_components
        self.n_components_optimal = ____

        # Calculate compression metrics
        compression_ratio = ____
        # The caller prints this value with `:.1%`, so it is expected to be a
        # fraction (e.g. 0.9), not a number on a 0-100 scale.
        storage_savings = ____

        return {
            'n_components': self.n_components_optimal,
            # Index is count - 1; presumably cumulative_var[k-1] is the
            # variance kept by the first k components -- confirm once filled in.
            'variance_preserved': cumulative_var[self.n_components_optimal-1],
            'compression_ratio': compression_ratio,
            'storage_savings_pct': storage_savings
        }

    def transform(self, X):
        """Apply optimized PCA transformation.

        Also stores the PCA model it builds on ``self.pca``.
        """
        # TODO: Implement transformation with optimal components
        X_scaled = ____
        self.pca = ____
        X_transformed = ____
        return X_transformed


# Test the implementation
optimizer = MarketingPCAOptimizer(variance_threshold=0.95)
results = optimizer.find_optimal_components(X)

print(f"Optimal components: {results['n_components']}")
print(f"Variance preserved: {results['variance_preserved']:.2%}")
print(f"Compression ratio: {results['compression_ratio']:.1f}x")
print(f"Storage savings: {results['storage_savings_pct']:.1%}")

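💡 Hint: Use np.cumsum() on explained_variance_ratio_ for cumulative variance. Find optimal components with np.argmax(cumulative_var >= threshold) + 1. Compression ratio = original_dims / n_components. Storage savings = 1 - n_components / original_dims; keep it as a fraction (e.g. 0.9), because the print statement formats it as a percentage.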

4. Business Interpretation

📋 Translation Challenge: Transform abstract mathematical components into actionable business segments. Identify the top contributing features for each principal component.

component_interpretation.py

def interpret_components(pca_model, feature_names, n_components=5, n_top_features=5):
    """Interpret principal components in business terms.

    Args:
        pca_model: Fitted PCA-like object exposing ``n_components_``,
            ``components_`` and ``explained_variance_ratio_``.
        feature_names: Sequence of names, indexed by column position.
        n_components: Maximum number of leading components to describe.
        n_top_features: How many of the strongest features (by absolute
            loading) to report for each component.

    Returns:
        A list with one dict per component, holding 'component',
        'variance_explained', 'top_features', 'loadings' and 'business_name'.
    """
    # Illustrative labels for the first three components; later components
    # get a generic "Pattern N" name.
    default_names = ('Affluent Lifestyle', 'Digital Engagement', 'Price Sensitivity')

    interpretations = []
    for i in range(min(n_components, pca_model.n_components_)):
        # Get component loadings
        component = pca_model.components_[i]

        # Strongest features first. Reversing before slicing keeps
        # n_top_features=0 well-defined (a `[-0:]` slice would return everything).
        top_indices = np.abs(component).argsort()[::-1][:n_top_features]

        if i < len(default_names):
            business_name = default_names[i]
        else:
            business_name = f'Pattern {i+1}'

        interpretations.append({
            'component': f'PC{i+1}',
            'variance_explained': pca_model.explained_variance_ratio_[i],
            'top_features': [feature_names[idx] for idx in top_indices],
            'loadings': [component[idx] for idx in top_indices],
            'business_name': business_name,
        })

    return interpretations


# Apply interpretation
feature_names = [f'feature_{i}' for i in range(n_features)]
X_transformed = optimizer.transform(X)  # also sets optimizer.pca, used below
interpretations = interpret_components(optimizer.pca, feature_names)

for interp in interpretations:
    print(f"\n{interp['component']}: {interp['business_name']}")
    print(f"Variance: {interp['variance_explained']:.2%}")
    print(f"Top features: {', '.join(interp['top_features'][:3])}")

5. ML Pipeline Integration

📋 Production Ready: Integrate PCA into a complete ML pipeline. Measure the performance improvement in both accuracy and speed.

ml_pipeline.py

import time

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


def _fit_and_score(pipeline, X_fit, y_fit, X_eval, y_eval):
    """Fit `pipeline`; return (fit duration in seconds, score on the eval split).

    Only the `fit` call is timed. `time.perf_counter()` is used because it is
    a monotonic, high-resolution clock meant for measuring intervals, whereas
    `time.time()` can jump when the system clock is adjusted.
    """
    start = time.perf_counter()
    pipeline.fit(X_fit, y_fit)
    elapsed = time.perf_counter() - start
    return elapsed, pipeline.score(X_eval, y_eval)


# Create synthetic target variable (customer segment)
y = (X[:, 0] + X[:, 50] + np.random.randn(n_customers) * 0.5) > 0

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Pipeline WITHOUT PCA
pipeline_original = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(n_estimators=50, random_state=42))
])

# Pipeline WITH PCA
pipeline_pca = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=47)),  # From our optimization
    ('classifier', RandomForestClassifier(n_estimators=50, random_state=42))
])

# Benchmark performance
print("Training Original Pipeline...")
time_original, score_original = _fit_and_score(
    pipeline_original, X_train, y_train, X_test, y_test
)

print("Training PCA Pipeline...")
time_pca, score_pca = _fit_and_score(
    pipeline_pca, X_train, y_train, X_test, y_test
)

# Guard the ratio so a zero-length measurement cannot raise ZeroDivisionError.
speedup = time_original / time_pca if time_pca > 0 else float('inf')

# Results
print(f"\n📊 Performance Comparison:")
print(f"Original: {score_original:.3f} accuracy in {time_original:.2f}s")
print(f"PCA: {score_pca:.3f} accuracy in {time_pca:.2f}s")
print(f"Speedup: {speedup:.1f}x")
# Difference of two accuracies, i.e. percentage points
print(f"Accuracy gain: {(score_pca - score_original)*100:.1f}%")

0x

Training Speedup

0%

Accuracy Gain

0%

Memory Reduction

6. Calculate Business Impact

📋 Executive Summary: Translate technical improvements into dollars. Calculate the total annual value created by your PCA implementation.

business_impact.py

def calculate_business_impact():
    """Calculate comprehensive business value of PCA implementation.

    All inputs are fixed scenario figures defined inside the function.

    Returns:
        A dict of dollar amounts under 'revenue_increase', 'waste_reduction',
        'infrastructure_savings', 'experimentation_value' and
        'total_annual_impact', plus 'roi_multiplier' (total impact divided by
        implementation cost) and 'payback_days' (implementation cost divided
        by total impact, scaled to a 365-day year).
    """
    # Marketing campaign metrics
    annual_marketing_spend = 187_500_000  # $187.5M

    # Performance improvements from PCA
    roi_improvement = 0.5  # 50% ROI improvement (1.8x → 2.7x)
    accuracy_improvement = 0.21  # 62% → 83% accuracy

    # Calculate revenue impact
    # NOTE(review): this applies the 50% relative improvement directly to
    # spend. If ROI means return per dollar spent, 1.8x → 2.7x would be
    # 0.9 x spend instead -- confirm the intended model.
    revenue_increase = annual_marketing_spend * roi_improvement

    # Cost reductions
    waste_reduction = annual_marketing_spend * 0.2 * accuracy_improvement

    # Infrastructure savings
    cloud_compute_annual = 2_400_000  # $2.4M annual
    compute_reduction = 0.91  # 500 dims → 47 dims
    infrastructure_savings = cloud_compute_annual * compute_reduction

    # Speed improvements enable more experiments
    experiments_increase = 10  # 10x more A/B tests possible
    value_per_experiment = 500_000  # Each test worth $500K
    experimentation_value = experiments_increase * value_per_experiment

    # Total impact
    total_impact = (revenue_increase + waste_reduction +
                    infrastructure_savings + experimentation_value)

    # ROI calculation
    implementation_cost = 500_000  # $500K for 3-month project
    roi_multiplier = total_impact / implementation_cost
    payback_days = (implementation_cost / total_impact) * 365

    return {
        'revenue_increase': revenue_increase,
        'waste_reduction': waste_reduction,
        'infrastructure_savings': infrastructure_savings,
        'experimentation_value': experimentation_value,
        'total_annual_impact': total_impact,
        'roi_multiplier': roi_multiplier,
        'payback_days': payback_days
    }


# Calculate impact
impact = calculate_business_impact()

print("💰 BUSINESS IMPACT REPORT")
print("=" * 50)
print(f"Revenue Increase: ${impact['revenue_increase']/1e6:.1f}M")
print(f"Waste Reduction: ${impact['waste_reduction']/1e6:.1f}M")
print(f"Infrastructure Savings: ${impact['infrastructure_savings']/1e6:.1f}M")
print(f"Experimentation Value: ${impact['experimentation_value']/1e6:.1f}M")
print("=" * 50)
print(f"TOTAL ANNUAL IMPACT: ${impact['total_annual_impact']/1e6:.1f}M")
# The ROI f-string was split across two physical lines in the original,
# which is a SyntaxError; it is a single literal here.
print(f"ROI: {impact['roi_multiplier']:.0f}x")
print(f"Payback Period: {impact['payback_days']:.0f} days")

PCA Implementation Lab